In [1]:
import datetime

import numpy as np
import pandas as pd

import bokeh
from bokeh.layouts import column, row
from bokeh.plotting import *

output_notebook()  # render Bokeh plots inline in this notebook
BokehJS successfully loaded.

Processing OBC (On-Board Computer) Data

Most of the data from the balloon flight was logged by programs running on a satellite on-board computer (OBC). This data was encoded in a binary format for better compression, then decoded using the decode_binary.py utility (see the Ardusat SDK discussion on binary data: https://github.com/ArduSat/ArduSatSDK#binary-data-format).

This decoding produces a text file with CSV formatted lines that looks like this:

1428160594,tmp102,0,242,248
1428160594,tsl2561,7,4941,4941
1428160594,l3gd20,10,17,7,-157,1
1428160594,lsm303_mag,12,-1765,2262,-2656,32
1428160594,lsm303_accel,11,-320,-1831,3530,24
1428160594,mlx90614,5,14209,14411
1428160594,tmp102,0,241,247
1428160594,tsl2561,7,4941,4941
1428160594,l3gd20,10,-62,91,-126,1

Each line corresponds to a reading from a particular sensor. However, we would rather have a CSV file organized so that each row is a timestamp and all the sensor values from that timestamp are columns. To do that, we'll use Python Pandas and python built-in functions to split out each sensor's rows into a separate collection, then add everything together into one DataFrame that we can use to represent the entire dataset and write to a CSV file.

While we're at it, we will parse the datatypes to form timestamp and number objects for each value in the data.

The timestamps here are in a standard time format in computer science called Unix epoch (also known as POSIX Time). It is defined as the number of seconds elapsed since January 1, 1970. See http://en.wikipedia.org/wiki/Unix_time for more details.

In [2]:
# Go through the dataset line by line and build a list of data rows for
# each sensor (the sensor name lives in column index 1)
from collections import defaultdict
import csv

data = defaultdict(list)
with open("./ase_hab/ardusat_full.csv", "r") as data_file:
    for row in csv.reader(data_file):
        # keep every field except the sensor-name column
        data[row[1]].append(tuple(row[:1] + row[2:]))
In [3]:
def fix_timeseries(df):
    """
    Fix up a raw sensor DataFrame's timeseries and set it as the index.

    Converts the string ``timestamp`` column to datetimes, adds a 500ms
    offset to every other observation (this dataset was logging every
    500ms, but the raw timestamps only resolve to the nearest second),
    drops rows left without a timestamp, and returns the frame indexed
    by ``timestamp``.

    Note: the even/odd slicing below intentionally leaves the final row
    (or rows) without a timestamp; ``dropna`` removes them.
    """
    df = df.copy()  # don't mutate the caller's frame
    ts = pd.to_datetime(df.timestamp.astype(int), unit='s')
    # Even-positioned rows keep the whole-second stamp; odd-positioned rows
    # get +500ms. concat preserves the original row labels, so the column
    # assignment below aligns by label.
    ts = pd.concat((ts[0:-1:2], ts[1:-1:2] + datetime.timedelta(milliseconds=500)))
    # Series.sort() was removed from pandas; sort_index() is the
    # non-mutating equivalent here (assignment aligns by label anyway).
    ts = ts.sort_index()
    df.timestamp = ts
    df = df.dropna()
    return df.set_index('timestamp')

Ambient Temperature Data (TMP102)

In [4]:
# Create a Pandas DataFrame Object (http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.html)
# for all the rows from the ambient temperature data, then fix up its index
tmp_data = fix_timeseries(
    pd.DataFrame.from_records(data['tmp102'],
                              columns=['timestamp', 'type', 'tmp1_raw', 'tmp2_raw']))
In [5]:
# Convert raw temperature readings to integer types
raw_cols = ['tmp1_raw', 'tmp2_raw']
tmp_data[raw_cols] = tmp_data[raw_cols].astype(int)

# correct for unsigned binary storage (bug in binary data storage) --
# presumably readings above 0x888 are negative values stored unsigned;
# TODO confirm the 0x888/0xFFF constants against the encoder
tmp_data[raw_cols] = tmp_data[raw_cols] - (tmp_data[raw_cols] > 0x888).astype(int) * 0xFFF

# Convert raw temp values to engineering values (degrees C); the TMP102
# resolution is 0.0625 degrees per LSB
tmp_data[['tmp1', 'tmp2']] = tmp_data[raw_cols] * 0.0625

Luminosity Data (TSL2561)

In [6]:
# Load the luminosity sensor rows and fix up the timeseries index
tsl2561_data = fix_timeseries(
    pd.DataFrame.from_records(data['tsl2561'],
                              columns=['timestamp', 'type', 'full_raw', 'tsl_ir_raw']))
In [7]:
# calculate luminosity from the raw full spectrum and IR values
# This calculation is fairly complicated - see the TSL2561 datasheet
# for more details: http://www.adafruit.com/datasheets/TSL2561.pdf

# Convert to integer types
raw_cols = ['full_raw', 'tsl_ir_raw']
tsl2561_data[raw_cols] = tsl2561_data[raw_cols].astype(int)

# Start the luminosity calculation.
# channel_scale is a fixed-point scaling constant, shifted left 4 bits;
# presumably it corresponds to the driver's integration-time/gain setting --
# TODO confirm against the TSL2561 datasheet
channel_scale = 0x7517
channel_scale = channel_scale << 4

# Scale both channels, then shift right 10 bits to undo the fixed-point scale
channel_zero = np.right_shift(tsl2561_data.full_raw * channel_scale, 10)
channel_one = np.right_shift(tsl2561_data.tsl_ir_raw * channel_scale, 10)

# Fixed-point ratio of IR (channel 1) to full-spectrum (channel 0),
# guarding against division by zero where channel_zero == 0.
# NOTE(review): on Python 3 the / below is true division; the float result
# is truncated when assigned into the int array, matching the Python 2
# integer division for non-negative inputs -- confirm inputs stay >= 0.
ratio1 = np.zeros(channel_zero.shape, dtype=int)
idx = (channel_zero != 0).values
ratio1[idx] = np.left_shift(channel_one[idx], 10) / channel_zero[idx]
# halve with rounding (add 1, shift right 1)
ratio = np.right_shift((ratio1 + 1), 1)

# Piecewise-linear lux approximation: select coefficients b (offset) and
# m (slope) per observation based on which ratio band it falls in. Each
# successive mask below overwrites the previous assignment for smaller
# ratio values, so order matters.
b = np.ones(ratio.shape)
m = np.ones(ratio.shape)

idx = ratio > 0x029a
b[idx] = 0x0
m[idx] = 0x0

idx = ratio <= 0x029a
b[idx] = 0x0018
m[idx] = 0x0012

idx = ratio <= 0x019a
b[idx] = 0x00d2
m[idx] = 0x00fb

idx = ratio <= 0x0138
b[idx] = 0x016f
m[idx] = 0x01fc

idx = ratio <= 0x0100
b[idx] = 0x0270
m[idx] = 0x03fe

idx = ratio <= 0x00c0
b[idx] = 0x023f
m[idx] = 0x037b

idx = ratio <= 0x0080
b[idx] = 0x0214
m[idx] = 0x02d1

idx = ratio <= 0x0040
b[idx] = 0x01f2
m[idx] = 0x01be

# Combine the two channels with the selected coefficients, add 2^(14-1)
# for rounding, then shift right 14 bits to produce integer lux values
temp = channel_zero * b - channel_one * m
temp += (1 << (14 - 1))
tsl2561_data['lux'] = pd.DataFrame(np.right_shift(temp.astype(int), 14), index=tsl2561_data.index)

Acceleration Data (LSM303)

In [8]:
# Load the LSM303 accelerometer rows and fix up the timeseries index
lsm303_accel_data = fix_timeseries(
    pd.DataFrame.from_records(data['lsm303_accel'],
                              columns=['timestamp', 'type', 'accel_x_raw',
                                       'accel_y_raw', 'accel_z_raw', 'gain']))
In [9]:
# Calculate engineering values for acceleration from raw acceleration data
col_names = ['accel_x', 'accel_y', 'accel_z']
# Use a list comprehension, NOT map(): on Python 3 map() returns a one-shot
# iterator, so indexing the DataFrame with it repeatedly (as below) breaks.
raw_cols = ["%s_raw" % c for c in col_names]
lsm303_accel_data[raw_cols] = lsm303_accel_data[raw_cols].astype(int)

# sensitivity constant from driver/datasheet (mg/LSB); divide by 1000 for g,
# multiply by 9.81 for m/s^2
sensitivity = 0.244
lsm303_accel_data[col_names] = lsm303_accel_data[raw_cols] * sensitivity / 1000 * 9.81
# overall magnitude of the acceleration vector
lsm303_accel_data['accel_magnitude'] = np.sqrt(np.power(lsm303_accel_data.accel_x, 2) +
                                               np.power(lsm303_accel_data.accel_y, 2) +
                                               np.power(lsm303_accel_data.accel_z, 2))

Magnetic Data (LSM303)

In [10]:
# Load the LSM303 magnetometer rows and fix up the timeseries index
lsm303_mag_data = fix_timeseries(
    pd.DataFrame.from_records(data['lsm303_mag'],
                              columns=['timestamp', 'type', 'mag_x_raw',
                                       'mag_y_raw', 'mag_z_raw', 'gain']))
In [11]:
# Calculate engineering values for the magnetic field from raw data
col_names = ['mag_x', 'mag_y', 'mag_z']
# Use a list comprehension, NOT map(): on Python 3 map() returns a one-shot
# iterator and cannot be reused as a column indexer (it is used twice below).
raw_cols = ["%s_raw" % c for c in col_names]
lsm303_mag_data[raw_cols] = lsm303_mag_data[raw_cols].astype(int)

# constant from driver/datasheet
sensitivity = 0.160
lsm303_mag_data[col_names] = lsm303_mag_data[raw_cols] * sensitivity * 0.1

Gyro Data (L3GD20)

In [12]:
# Load the L3GD20 gyro rows and fix up the timeseries index
l3gd20_data = fix_timeseries(
    pd.DataFrame.from_records(data['l3gd20'],
                              columns=['timestamp', 'type', 'gyro_x_raw',
                                       'gyro_y_raw', 'gyro_z_raw', 'gain']))
In [13]:
# Calculate engineering values from raw gyro data
col_names = ['gyro_x', 'gyro_y', 'gyro_z']
# Use a list comprehension, NOT map(): on Python 3 map() returns a one-shot
# iterator and cannot be reused as a column indexer (it is used twice below).
raw_cols = ["%s_raw" % c for c in col_names]
l3gd20_data[raw_cols] = l3gd20_data[raw_cols].astype(int)

# constant from driver/datasheet
sensitivity = 0.00875
l3gd20_data[col_names] = l3gd20_data[raw_cols] * sensitivity

IR Temperature Data (MLX90614)

In [14]:
# Load the MLX90614 IR thermometer rows and fix up the timeseries index
mlx90614_data = fix_timeseries(
    pd.DataFrame.from_records(data['mlx90614'],
                              columns=['timestamp', 'type', 'ir_raw', 'ambient_temp_raw']))
In [15]:
# Convert raw MLX90614 counts to engineering values
col_names = ['ir', 'ambient_temp']
# Use a list comprehension, NOT map(): on Python 3 map() returns a one-shot
# iterator and cannot be reused as a column indexer (it is used twice below).
raw_cols = ["%s_raw" % c for c in col_names]
mlx90614_data[raw_cols] = mlx90614_data[raw_cols].astype(int)
# counts are 0.02 K per LSB; subtract 273.15 to get Celsius (the extra
# -0.02 offset presumably matches the sensor driver -- confirm vs datasheet)
mlx90614_data[col_names] = mlx90614_data[raw_cols] * 0.02 - 0.02 - 273.15

Arduino Data

The data from the upward pointing sensor array was logged using an Arduino running the ArdusatSDK. Like the OBC data, it was logged using a binary format, then decoded to produce lines of this form:

372,uv,5,3.625232
397,uv,6,0.230535
413,uv,7,0.618744
319,accelerometer,0,78.388870,60.684376,29.900711
321,magnetic,1,-43.984001,-43.040001,48.191998
323,gyro,2,0.003971,0.001222,0.002749
324,temperature,3,-4.450012
325,luminosity,4,60000.000000

The data is slightly different from the OBC data in that the timestamps are in milliseconds since the Arduino started up, instead of Unix epoch time. Because of this, we recorded the epoch time in milliseconds that the Arduino was turned on in order to be able to offset the time series.

Another difference with this dataset is that the values are already in engineering values (rather than raw values), so the only conversion required is to convert the raw strings parsed by the python csv package into float objects.

Resampling the Data

In the raw data, each row in the log file has a slightly different timestamp that corresponds to the exact time that observation was made. Although this is technically the most accurate way to represent the data, it is not as useful to us. When we look at the data, we would like to be able to match up data from different sensors that occurred at virtually the same time. In this case, we don't really care about the difference between 372 and 397 milliseconds, but we do care that both occurred between 0 and 500 milliseconds. In order to make the data more useful, we resample the data. This means that we divide the data up into blocks of a given frequency (e.g. 500 milliseconds), then take all data observations within each block, average them, and assign the resulting average to the timestamp at the end of the block.

In [16]:
# Epoch time (milliseconds, UTC) at which the Arduino logger started;
# used to offset the Arduino's relative ms-since-boot timestamps
base_time = 1428160637297
In [17]:
# Parse the decoded CSV file to make a list of observations from each
# sensor (the sensor-type code lives in column index 2)
from collections import defaultdict
import csv

board_data = defaultdict(list)
with open("./ase_hab/HAB1.csv", "r") as data_file:
    for row in csv.reader(data_file):
        # keep every field except the sensor-type column
        board_data[row[2]].append(tuple(row[:2] + row[3:]))
In [18]:
def fix_demosat_timeseries(df, start_ms=None):
    """
    Fix up an Arduino-logged timeseries and set it as the DataFrame index.

    The Arduino records milliseconds since boot, so each timestamp is
    offset by the epoch time (ms) at which the logger started, converted
    to datetimes, NaT rows are dropped, and the frame is indexed by
    ``timestamp``.

    Parameters
    ----------
    df : DataFrame with a string ``timestamp`` column of ms-since-boot.
    start_ms : int, optional
        Epoch milliseconds when the Arduino started. Defaults to the
        module-level ``base_time`` recorded for this flight.
    """
    if start_ms is None:
        start_ms = base_time
    df = df.copy()  # don't mutate the caller's frame
    df.timestamp = pd.to_datetime(df.timestamp.astype(int) + start_ms, unit='ms')
    df = df.dropna()
    return df.set_index('timestamp')
In [19]:
# Create Pandas DataFrame out of acceleration records & coerce datatypes
up_accel_data = pd.DataFrame.from_records(board_data['0'],
                                          columns=['timestamp', 'type', 'up_accel_x', 'up_accel_y', 'up_accel_z'])
up_accel_data = fix_demosat_timeseries(up_accel_data)
for axis_col in ('up_accel_x', 'up_accel_y', 'up_accel_z'):
    up_accel_data[axis_col] = up_accel_data[axis_col].astype(float)
In [20]:
# Create Pandas DataFrame out of magnetic records & coerce datatypes
up_mag_data = pd.DataFrame.from_records(board_data['1'],
                                        columns=['timestamp', 'type', 'up_mag_x', 'up_mag_y', 'up_mag_z'])
up_mag_data = fix_demosat_timeseries(up_mag_data)
for axis_col in ('up_mag_x', 'up_mag_y', 'up_mag_z'):
    up_mag_data[axis_col] = up_mag_data[axis_col].astype(float)
In [21]:
# Create Pandas DataFrame out of gyro records & coerce datatypes
up_gyro_data = pd.DataFrame.from_records(board_data['2'],
                                         columns=['timestamp', 'type', 'up_gyro_x', 'up_gyro_y', 'up_gyro_z'])
up_gyro_data = fix_demosat_timeseries(up_gyro_data)
for axis_col in ('up_gyro_x', 'up_gyro_y', 'up_gyro_z'):
    up_gyro_data[axis_col] = up_gyro_data[axis_col].astype(float)
In [22]:
# Create Pandas DataFrame out of IR records & coerce datatypes
up_ir_temp_data = pd.DataFrame.from_records(board_data['3'],
                                            columns=['timestamp', 'type', 'up_ir_temp'])
up_ir_temp_data = fix_demosat_timeseries(up_ir_temp_data)
up_ir_temp_data['up_ir_temp'] = up_ir_temp_data['up_ir_temp'].astype(float)
In [23]:
# Create Pandas DataFrame out of Luminosity records & coerce datatypes
up_lux_data = pd.DataFrame.from_records(board_data['4'],
                                        columns=['timestamp', 'type', 'up_luminosity'])
up_lux_data = fix_demosat_timeseries(up_lux_data)
up_lux_data['up_luminosity'] = up_lux_data['up_luminosity'].astype(float)
In [24]:
# Create Pandas DataFrames out of the UV records & coerce datatypes
# There were 3 different UV sensors logged here, so we need to create 3 different
# DataFrames
h_uv_data = pd.DataFrame.from_records(board_data['5'],
                                      columns=['timestamp', 'type', 'horizon_uv'])
h_uv_data = fix_demosat_timeseries(h_uv_data)
h_uv_data.horizon_uv = h_uv_data.horizon_uv.astype(float)

d_uv_data = pd.DataFrame.from_records(board_data['6'],
                                      columns=['timestamp', 'type', 'down_uv'])
d_uv_data = fix_demosat_timeseries(d_uv_data)
d_uv_data.down_uv = d_uv_data.down_uv.astype(float)

u_uv_data = pd.DataFrame.from_records(board_data['7'],
                                      columns=['timestamp', 'type', 'up_uv'])
u_uv_data = fix_demosat_timeseries(u_uv_data)
u_uv_data.up_uv = u_uv_data.up_uv.astype(float)

# Concatenate all the UV data objects into one dataframe (axis is specified to merge columns)
uv_data = pd.concat((h_uv_data.horizon_uv, d_uv_data.down_uv, u_uv_data.up_uv),
                    axis=1)

# Since each UV data observation has a slightly different timestamp (in milliseconds),
# we resample the dataset to standardize the timeseries at 500ms intervals.
# Modern pandas requires an explicit aggregator after .resample() -- .mean()
# reproduces the old implicit default; .ffill() replaces fillna(method="pad").
uv_data = uv_data.resample('500ms').mean().ffill()
In [25]:
# Concatenate all the Arduino data into a single DataFrame and resample onto
# a common 500ms grid. Modern pandas needs an explicit aggregator after
# .resample(); .mean(numeric_only=True) reproduces the old default, which
# averaged numeric columns and silently dropped non-numeric ones (like the
# string 'type' columns). .ffill() replaces fillna(method="pad").
demosat_data = pd.concat((
    up_accel_data,
    up_mag_data,
    up_gyro_data,
    up_ir_temp_data,
    up_lux_data,
    uv_data,
), axis=1).resample('500ms').mean(numeric_only=True).ffill()

Telemetry Data

The High Altitude Balloon had a GPS tracking unit designed to allow the chase team to locate the balloon after landing in order to recover the payload and saved data. This data includes a number of interesting variables, including:

  • Lat/Long (position)
  • Altitude
  • Ground Speed
  • Pressure
  • Relative Humidity

This data is in CSV form, so we just need to load in the data using the pandas.read_csv function, then parse the time columns into a Pandas DateTimeIndex so that we can merge this data with the Ardusat logged data.

In [26]:
# Load in Telemetry data exported from the balloon's GPS tracking unit
# (FlightData sheet of the EDGE21 spreadsheet export); the timestamp
# parsing and column cleanup happen in the following cells
tel_data = pd.read_csv('./ase_hab/EDGE21_data_csv/FlightData-Table 1.csv')
In [27]:
def make_timestamp(row, year=2015, month=4, day=4):
    """Make a single timestamp out of a row of telemetry data.

    The telemetry file only records time-of-day ('Hrs', 'Minutes',
    'Seconds'), so the flight date is supplied via the year/month/day
    parameters (defaulting to this flight's date, 2015-04-04).

    Returns the row with a new ``timestamp`` field added.
    """
    ts = datetime.datetime(year, month, day, row['Hrs'], row['Minutes'], row['Seconds'])
    row['timestamp'] = ts
    return row

# Build a timestamp for every telemetry row, convert the column into a
# proper DateTimeIndex, then index by it and mark the times as UTC
tel_data = tel_data.apply(make_timestamp, axis=1)
tel_data['timestamp'] = pd.to_datetime(tel_data['timestamp'])
tel_data = tel_data.set_index('timestamp').tz_localize('utc')
In [28]:
# Clean up the awkward multi-line column names from the spreadsheet export,
# then select out only the data we actually care about
tel_data = tel_data.rename(columns={
    'Alt\nm': 'altitude',
    'Spd\nkts': 'speed',
    'Temp\ndeg C': 'temp',
    'Pressure\nkPa': 'pressure',
    'R.H.\n%': 'rel_humidity',
})
tel_data = tel_data[['altitude', 'speed', 'temp', 'pressure', 'rel_humidity', 'Latitude', 'Longitude']]

Combine all the Data

Now, we need to combine each of the individual DataFrames into one complete dataset. To do this, we'll use the Pandas concat function, and tell it to combine columns. Pandas automatically processes the indexes of each individual DataFrame in order to splice the data together correctly. For this process to work, we need to make sure there are no duplicate index values; the call to drop_duplicates takes care of that for us. Finally, Pandas ensures that timezones of the data match up in addition to the times, so we have to use tz_localize to set the timezone for each DataFrame.

In [51]:
# drop_duplicates(cols=..., take_last=...) and .ix were removed from pandas;
# subset=/keep= and .loc are the modern equivalents, and .ffill() replaces
# fillna(method="pad"). The repeated dedupe/localize chain is factored into
# a helper.
def _dedup_and_localize(df):
    """Drop duplicate timestamps (keeping the last observation) and
    localize the index to UTC so the frames can be concatenated."""
    deduped = df.reset_index().drop_duplicates(subset='timestamp', keep='last')
    return deduped.set_index('timestamp').tz_localize('utc')

obc_data = pd.concat((
    _dedup_and_localize(tmp_data),
    _dedup_and_localize(lsm303_accel_data),
    _dedup_and_localize(lsm303_mag_data),
    _dedup_and_localize(l3gd20_data),
    _dedup_and_localize(mlx90614_data),
    _dedup_and_localize(tsl2561_data),
    tel_data
), axis=1).loc[:'4-4-2015 17:10'].ffill()
In [52]:
# The altitude data has some missing points. Rather than backfill the data from the
# last known good points, we can do a linear interpolation on the data to fill in this
# data, since we know the altitude on the climb is roughly linear with respect to time
obc_data['altitude'] = obc_data['altitude'].interpolate(method='linear')
In [53]:
p = figure(width=500, height=500, title="Altitude (no interpolation)", x_axis_type="datetime")
p1 = figure(width=500, height=500, title="Altitude (interpolation)", x_axis_type="datetime")
# .ix was removed from pandas; .loc performs the same label-based slice
df = tel_data.reindex(obc_data.index, method="pad").loc[:'4-4-2015 17:10']
p.line(df.index, df.altitude)
p1.line(obc_data.index, obc_data.altitude, color='red')

# VBox/HBox were removed from bokeh; bokeh.layouts column()/row() replace them
show(column(row(p, p1)))
In [435]:
# Save the combined, cleaned dataset to CSV (header=True writes the
# column names as the first row)
obc_data.to_csv('./cleaned_flight_data.csv', header=True)

Plots

There are many interesting insights to be drawn from this data; far more than we can get into here. Here are a few demonstration plots using the excellent [Bokeh](http://bokeh.pydata.org/) plotting library to get started visualizing the sensor data.

In [56]:
# Plot both ambient temperature channels on one figure
tmp_plot = figure(plot_width=500, plot_height=500, title="Air Temp",
                  x_axis_type="datetime")
for series, line_color in ((obc_data.tmp1, 'red'), (obc_data.tmp2, 'blue')):
    tmp_plot.line(obc_data.index, series, color=line_color)
Out[56]:
<bokeh.plotting.Figure at 0x10ea6bb90>
In [69]:
# Plot the calculated luminosity over the flight
lux_plot = figure(plot_width=500, plot_height=500, title="Luminosity",
                  x_axis_type="datetime")
lux_plot.line(obc_data.index, obc_data['lux'], color='red')
Out[69]:
<bokeh.plotting.Figure at 0x10c157410>
In [70]:
accel_plot = figure(plot_width=500, plot_height=500, title="Acceleration (100 period rolling mean)",
                    x_axis_type="datetime")

# Because the acceleration data is quite noisy, we'll apply a 100 period moving
# average window to the data to smooth it out.
# pd.rolling_mean() was removed from pandas; Series.rolling(...).mean() is the
# equivalent replacement (same default min_periods).
# NOTE(review): newer bokeh renamed legend= to legend_label= -- confirm the
# installed bokeh version accepts legend=.
avg_periods = 100
accel_plot.line(obc_data.index, obc_data.accel_x.rolling(avg_periods).mean(),
                color='red', legend="x")
accel_plot.line(obc_data.index, obc_data.accel_y.rolling(avg_periods).mean(),
                color='blue', legend="y")
accel_plot.line(obc_data.index, obc_data.accel_z.rolling(avg_periods).mean(),
                color='green', legend="z")
Out[70]:
<bokeh.plotting.Figure at 0x113419550>
In [71]:
mag_plot = figure(plot_width=500, plot_height=500, title="Magnetic Field",
                  x_axis_type="datetime")
# pd.rolling_mean() was removed from pandas; Series.rolling(...).mean() is the
# equivalent replacement.
avg_periods = 100
mag_plot.line(obc_data.index, obc_data.mag_x.rolling(avg_periods).mean(),
              color='red', legend="x")
mag_plot.line(obc_data.index, obc_data.mag_y.rolling(avg_periods).mean(),
              color='blue', legend="y")
mag_plot.line(obc_data.index, obc_data.mag_z.rolling(avg_periods).mean(),
              color='green', legend="z")
Out[71]:
<bokeh.plotting.Figure at 0x113419f10>
In [72]:
gyro_plot = figure(plot_width=500, plot_height=500, title="Gyro",
                   x_axis_type="datetime")
# pd.rolling_mean() was removed from pandas; Series.rolling(...).mean() is the
# equivalent replacement.
avg_periods = 100
gyro_plot.line(obc_data.index, obc_data.gyro_x.rolling(avg_periods).mean(),
               color='red', legend="x")
gyro_plot.line(obc_data.index, obc_data.gyro_y.rolling(avg_periods).mean(),
               color='blue', legend="y")
gyro_plot.line(obc_data.index, obc_data.gyro_z.rolling(avg_periods).mean(),
               color='green', legend="z")
Out[72]:
<bokeh.plotting.Figure at 0x11341db50>
In [75]:
# Plot the IR (surface) temperature over the flight
ir_plot = figure(plot_width=500, plot_height=500, title="IR",
                 x_axis_type="datetime")
ir_plot.line(obc_data.index, obc_data['ir'], color='red')
Out[75]:
<bokeh.plotting.Figure at 0x10e90b350>
In [76]:
# Lay out all six plots in a 3x2 grid and display them.
# VBox/HBox were removed from bokeh; bokeh.layouts column()/row() replace them
show(column(
    row(tmp_plot, ir_plot),
    row(accel_plot, mag_plot),
    row(gyro_plot, lux_plot)
))